# <div class="alert alert-block alert-danger">
# <link rel="icon" href="/static/img/favicon-bee.ico">
# </div>
from IPython.display import HTML, display
import plotly.io as pio

# Emit the favicon <link> tag as HTML output so it ends up in the rendered notebook.
favicon_tag = HTML('<link rel="icon" href="/static/img/favicon-bee.ico">')
display(favicon_tag)

# Two renderers are chained so Plotly figures show up in both targets:
#   plotly_mimetype -> VS Code notebook UI
#   notebook        -> "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "plotly_mimetype+notebook"
'''add this to the html file
<link rel="icon" href="/static/img/favicon-bee.ico">
'''
Data prep
EDA
Regression modelling to estimate how different attributes affect listing price
Identify spatial clusters of listings
Plot clusters on a map
Does being close to the cluster centre affect prices?
Is there a superhost premium? (Do superhosts charge more?)
Are listing prices more expensive on weekends? To what extent? Do we observe the same in all cities?
I used a data set that contains Airbnb listings in 10 European cities, with the following attributes available for each listing:
Data source: Kaggle https://www.kaggle.com/datasets/thedevastator/airbnb-prices-in-european-cities
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
# Import from the public scipy.stats namespace; scipy.stats.stats is a
# deprecated private module path.
from scipy.stats import pearsonr
Import data
cities_list = ['amsterdam', 'athens', 'barcelona', 'berlin', 'budapest',
               'lisbon', 'london', 'paris', 'rome', 'vienna']

# One dataframe per city, keyed by city name, kept separately per part of the week.
weekdays_dict = {}
weekends_dict = {}

for city in cities_list:
    # Tag every row with its city, a 0/1 weekend dummy and a readable label
    # so the frames can be stacked later without losing their origin.
    df_weekdays = pd.read_csv(f'data/{city}_weekdays.csv').assign(
        city=city, is_weekend=0, week_part='weekday')
    df_weekends = pd.read_csv(f'data/{city}_weekends.csv').assign(
        city=city, is_weekend=1, week_part='weekend')

    weekdays_dict[city] = df_weekdays
    weekends_dict[city] = df_weekends

    # Row counts per file, as a quick sanity check on the load.
    print(city, f'weekdays: {len(df_weekdays)}, weekends: {len(df_weekends)}')
amsterdam weekdays: 1103, weekends: 977 athens weekdays: 2653, weekends: 2627 barcelona weekdays: 1555, weekends: 1278 berlin weekdays: 1284, weekends: 1200 budapest weekdays: 2074, weekends: 1948 lisbon weekdays: 2857, weekends: 2906 london weekdays: 4614, weekends: 5379 paris weekdays: 3130, weekends: 3558 rome weekdays: 4492, weekends: 4535 vienna weekdays: 1738, weekends: 1799
Combine into one dataframe
# Stack all weekday frames followed by all weekend frames into one table.
# ignore_index=True gives every row a unique label: each CSV is read with its
# own 0..n index, so without it the combined frame carries duplicate index
# labels, which makes label-based alignment (e.g. a later axis=1 concat) fragile.
df = pd.concat(
    [*weekdays_dict.values(), *weekends_dict.values()],
    ignore_index=True,
).drop(columns=['Unnamed: 0'])  # presumably the row index saved into the CSVs

# NOTE(review): halving assumes realSum is the price of a two-night stay -
# confirm against the dataset description.
df['price_per_night'] = df['realSum'] / 2
Averages (median for price per night, mean for all other attributes)
# Per-city profile: median nightly price plus the mean of every other
# attribute, ordered from the cheapest to the most expensive city.
mean_columns = ['room_shared', 'bedrooms', 'person_capacity',
                'host_is_superhost', 'multi', 'biz',
                'guest_satisfaction_overall', 'cleanliness_rating',
                'dist', 'metro_dist',
                'attr_index_norm', 'rest_index_norm']
agg_spec = {'price_per_night': 'median'}
agg_spec.update({column: 'mean' for column in mean_columns})

city_groups = df.groupby('city')
df_ = city_groups.agg(agg_spec).reset_index().sort_values('price_per_night')
df_
| city | price_per_night | room_shared | bedrooms | person_capacity | host_is_superhost | multi | biz | guest_satisfaction_overall | cleanliness_rating | dist | metro_dist | attr_index_norm | rest_index_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | athens | 63.857709 | 0.002083 | 1.271402 | 3.698106 | 0.428598 | 0.267424 | 0.379924 | 95.003598 | 9.638447 | 1.803080 | 0.478656 | 5.740839 | 9.954268 |
| 4 | budapest | 76.491047 | 0.003481 | 1.105669 | 3.540776 | 0.378916 | 0.303332 | 0.348831 | 94.585281 | 9.477374 | 1.872763 | 0.544059 | 12.675248 | 34.529089 |
| 8 | rome | 91.295911 | 0.001329 | 1.229755 | 3.357372 | 0.326687 | 0.385953 | 0.334109 | 93.122300 | 9.514678 | 3.026982 | 0.819794 | 10.426968 | 25.078056 |
| 3 | berlin | 95.587548 | 0.029388 | 1.070451 | 2.774960 | 0.257246 | 0.276167 | 0.174718 | 94.323671 | 9.461755 | 5.257093 | 0.836064 | 16.803111 | 30.666967 |
| 2 | barcelona | 104.149696 | 0.004236 | 1.161313 | 2.616661 | 0.181433 | 0.385104 | 0.325450 | 91.109072 | 9.291564 | 2.116982 | 0.441248 | 16.636220 | 19.376528 |
| 9 | vienna | 104.247014 | 0.004524 | 1.102347 | 3.350297 | 0.284139 | 0.279050 | 0.339836 | 93.731128 | 9.472434 | 3.139488 | 0.526670 | 8.762474 | 4.239580 |
| 5 | lisbon | 112.687617 | 0.012841 | 1.272428 | 3.343398 | 0.213951 | 0.239459 | 0.587541 | 91.093875 | 9.370640 | 1.966893 | 0.711482 | 7.324730 | 28.274084 |
| 6 | london | 130.647475 | 0.005004 | 1.128790 | 2.846192 | 0.157410 | 0.274992 | 0.387872 | 90.645652 | 9.175023 | 5.326421 | 1.005547 | 20.537398 | 11.234105 |
| 7 | paris | 158.798583 | 0.014055 | 0.972787 | 2.953648 | 0.140700 | 0.219498 | 0.245813 | 92.037530 | 9.263606 | 2.995823 | 0.227323 | 18.204358 | 42.589111 |
| 0 | amsterdam | 230.122091 | 0.004808 | 1.292308 | 2.781731 | 0.284135 | 0.283173 | 0.105288 | 94.514423 | 9.465865 | 2.825052 | 1.089367 | 14.246499 | 26.097566 |
standard deviation
# Per-city spread: standard deviation of every attribute, ordered by the
# spread of the nightly price (smallest first).
std_columns = ['price_per_night', 'room_shared',
               'bedrooms', 'person_capacity',
               'host_is_superhost', 'multi', 'biz',
               'guest_satisfaction_overall', 'cleanliness_rating',
               'dist', 'metro_dist',
               'attr_index_norm', 'rest_index_norm']
df_ = (
    df.groupby('city')
    .agg(dict.fromkeys(std_columns, 'std'))
    .reset_index()
    .sort_values('price_per_night')
)
df_
| city | price_per_night | room_shared | bedrooms | person_capacity | host_is_superhost | multi | biz | guest_satisfaction_overall | cleanliness_rating | dist | metro_dist | attr_index_norm | rest_index_norm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | lisbon | 54.486540 | 0.112596 | 0.728539 | 1.344214 | 0.410128 | 0.426790 | 0.492320 | 9.148114 | 0.924080 | 1.742681 | 0.920204 | 5.082390 | 17.877309 |
| 8 | rome | 59.309052 | 0.036438 | 0.549710 | 1.309052 | 0.469028 | 0.486847 | 0.471704 | 7.815107 | 0.808415 | 1.644095 | 0.631361 | 6.631054 | 13.414188 |
| 4 | budapest | 65.572403 | 0.058903 | 0.663484 | 1.256548 | 0.485177 | 0.459754 | 0.476660 | 6.525680 | 0.842693 | 1.874925 | 0.856410 | 6.672535 | 19.111435 |
| 3 | berlin | 117.664645 | 0.168926 | 0.552033 | 1.188142 | 0.437204 | 0.447191 | 0.379802 | 6.809406 | 0.849384 | 3.692649 | 1.267283 | 10.774273 | 16.634505 |
| 1 | athens | 132.940027 | 0.045600 | 0.652575 | 1.284703 | 0.494922 | 0.442657 | 0.485414 | 8.348637 | 0.839767 | 0.953738 | 0.284154 | 4.667181 | 10.778060 |
| 7 | paris | 165.474872 | 0.117727 | 0.642571 | 1.215007 | 0.347738 | 0.413937 | 0.430601 | 8.818201 | 0.974036 | 1.463542 | 0.122769 | 7.759372 | 15.680438 |
| 2 | barcelona | 177.733944 | 0.064956 | 0.517108 | 1.153124 | 0.385445 | 0.486706 | 0.468625 | 8.607153 | 1.014577 | 1.377859 | 0.284540 | 9.591798 | 10.256285 |
| 9 | vienna | 198.873582 | 0.067115 | 0.602819 | 1.282163 | 0.451067 | 0.448596 | 0.473720 | 7.220808 | 0.855439 | 1.942337 | 0.516132 | 6.259531 | 3.683256 |
| 0 | amsterdam | 215.329203 | 0.069187 | 0.736683 | 1.032634 | 0.451110 | 0.450648 | 0.306999 | 6.350874 | 0.813421 | 2.082573 | 0.831669 | 10.335158 | 17.720931 |
| 6 | london | 235.678632 | 0.070562 | 0.579477 | 1.246235 | 0.364205 | 0.446533 | 0.487289 | 11.510622 | 1.166180 | 2.712573 | 1.263926 | 11.914575 | 6.963803 |
metric_list = ['price_per_night', 'room_shared', 'bedrooms', 'person_capacity',
               'host_is_superhost', 'multi', 'biz',
               'guest_satisfaction_overall', 'cleanliness_rating',
               'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm']

# Rank cities on their typical values (median price, mean of everything else).
# This is computed directly from df rather than read from the scratch variable
# df_: df_ holds whichever summary was assigned last, which is the table of
# standard deviations when the notebook runs top to bottom, so ranking on it
# reports the city with the largest/smallest spread, not the largest/smallest
# average.
city_profile = df.groupby('city').agg(
    {metric: ('median' if metric == 'price_per_night' else 'mean')
     for metric in metric_list}
)

# idxmax / idxmin return, per metric column, the city label of the extreme row.
highest = city_profile[metric_list].idxmax().tolist()
lowest = city_profile[metric_list].idxmin().tolist()

pd.DataFrame({'metric': metric_list, 'highest': highest, 'lowest': lowest})
| metric | highest | lowest | |
|---|---|---|---|
| 0 | price_per_night | london | lisbon |
| 1 | room_shared | berlin | rome |
| 2 | bedrooms | amsterdam | barcelona |
| 3 | person_capacity | lisbon | amsterdam |
| 4 | host_is_superhost | athens | paris |
| 5 | multi | rome | paris |
| 6 | biz | lisbon | amsterdam |
| 7 | guest_satisfaction_overall | london | amsterdam |
| 8 | cleanliness_rating | london | rome |
| 9 | dist | berlin | athens |
| 10 | metro_dist | berlin | paris |
| 11 | attr_index_norm | london | athens |
| 12 | rest_index_norm | budapest | vienna |
# A host counts as having multiple listings when either flag is set.
has_several_listings = (df['multi'] == 1) | (df['biz'] == 1)
df['host_has_multiple_listings'] = np.where(
    has_several_listings, 'multiple listings', 'single listing')

# The business label is derived from the `biz` flag. It was previously derived
# from `multi`, which made this column just a subset of the
# multiple-listings label above.
# NOTE(review): assumes `biz` is the dataset's business-host indicator -
# confirm against the dataset description.
df['host_is_biz'] = np.where(df['biz'] == 1, 'Business', 'Private')
The table of city averages shows that:
Athens has the best guest satisfaction and cleanliness ratings, and is also the cheapest city despite having the highest average person capacity.
Amsterdam is the most expensive city, presumably because it has the highest average number of bedrooms.
London has the best attraction index but the worst guest satisfaction and cleanliness ratings.
# Grouped box plot of nightly prices per city, weekdays vs weekends.
fig = go.Figure()
for week_part in ['weekday', 'weekend']:
    # Filter once per group instead of once per axis.
    part_rows = df[df['week_part'] == week_part]
    fig.add_trace(go.Box(y=part_rows['price_per_night'],
                         x=part_rows['city'],
                         name=week_part))
fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    title='Box plot of listing prices<sup><br>Listing prices on weekdays and weekends are similar, except in Amsterdam where prices are higher on weekends. </sup>',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    # The plotted column is price_per_night, so the axis is labelled per night.
    yaxis=dict(title='price per night (euros)',
               range=[0, 800],  # clips extreme outliers so the boxes stay readable
               tickformat=',.0f'),
)
fig.show()
# df_ = df.groupby(['city', 'week_part']).agg({'person_capacity':'mean'}).reset_index()
# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
# fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['person_capacity'],
# x=df_[df_['week_part']==week_part]['city'],
# text=df_[df_['week_part']==week_part]['person_capacity'],
# textposition='outside',
# texttemplate='%{text:.1f}',
# #boxpoints=False,
# name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')
# fig.update_layout(
# boxmode='group', title='Average person capacity', plot_bgcolor='rgba(0, 0, 0, 0)',
# yaxis=dict(title='personal capacity (mean)', tickformat=',.1f', range=[2,4]),
# height=400
# )
# fig.show()
# df_ = df.groupby(['city', 'week_part']).agg({'guest_satisfaction_overall':'mean'}).reset_index()
# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
# fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['guest_satisfaction_overall'],
# x=df_[df_['week_part']==week_part]['city'],
# text=df_[df_['week_part']==week_part]['guest_satisfaction_overall'],
# textposition='outside',
# texttemplate='%{text:.1f}',
# #boxpoints=False,
# name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')
# fig.update_layout(
# boxmode='group', title='Guest satisfaction rating', plot_bgcolor='rgba(0, 0, 0, 0)',
# yaxis=dict(title='personal capacity (mean)', tickformat=',.0f', range=[90,100]
# ),
# height=400
# )
# fig.show()
# df_ = df.groupby(['city', 'week_part']).agg({'host_is_superhost':'mean'}).reset_index()
# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
# fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['host_is_superhost'],
# x=df_[df_['week_part']==week_part]['city'],
# text=df_[df_['week_part']==week_part]['host_is_superhost'],
# textposition='outside',
# texttemplate='%{text:.1f}',
# #boxpoints=False,
# name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')
# fig.update_layout(
# boxmode='group', title='Distance from metro', plot_bgcolor='rgba(0, 0, 0, 0)',
# yaxis=dict(title='metro distance', tickformat=',.1f',# range=[90,100]
# ),
# height=400
# )
# fig.show()
# Grouped box plot of nightly prices per city, private vs business hosts.
fig = go.Figure()
for biz_status in ['Private', 'Business']:
    # Filter once per group instead of once per axis.
    status_rows = df[df['host_is_biz'] == biz_status]
    fig.add_trace(go.Box(y=status_rows['price_per_night'],
                         x=status_rows['city'],
                         name=biz_status))
fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    title='Box plot of listing prices - private vs business<sup><br>Listing prices for business hosts are slightly lower than private, especially in Amsterdam. </sup>',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    # The plotted column is price_per_night, so the axis is labelled per night.
    yaxis=dict(title='price per night (euros)',
               range=[0, 800],  # clips extreme outliers so the boxes stay readable
               tickformat=',.0f'),
)
fig.show()
# Grouped box plot of nightly prices per city, single- vs multi-listing hosts.
fig = go.Figure()
for biz_status in ['single listing', 'multiple listings']:
    # Filter once per group instead of once per axis.
    status_rows = df[df['host_has_multiple_listings'] == biz_status]
    fig.add_trace(go.Box(y=status_rows['price_per_night'],
                         x=status_rows['city'],
                         name=biz_status))
fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    # NOTE(review): this subtitle is word-for-word the one used on the
    # private-vs-business figure and talks about business hosts, not about
    # single vs multiple listings - re-check it against this plot.
    title='Box plot of listing prices - single vs multiple listings<sup><br>Listing prices for business hosts are slightly lower than private, especially in Amsterdam. </sup>',
    plot_bgcolor='rgba(0, 0, 0, 0)',
    # The plotted column is price_per_night, so the axis is labelled per night.
    yaxis=dict(title='price per night (euros)',
               range=[0, 800],  # clips extreme outliers so the boxes stay readable
               tickformat=',.0f'),
)
fig.show()
Quick analysis of the Pearson correlation coefficients* between variables
* Pearson correlation coefficient is the ratio between the covariance of the two variables and the product of their standard deviations. So it is essentially a normalized measurement of covariance.
vars_of_interest = [
    'price_per_night', 'room_shared', 'bedrooms', 'person_capacity',
    'host_is_superhost', 'multi', 'biz',
    'guest_satisfaction_overall', 'cleanliness_rating',
    'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm',
]

# Pairwise Pearson correlations across all listings, rounded for display.
df_corr = df[vars_of_interest].corr().round(2)

# True on and above the diagonal: those cells are blanked so each pair of
# variables appears only once in the heatmap.
mask = np.triu(np.ones(df_corr.shape, dtype=bool))
df_corr_vis = df_corr.where(~mask)

heatmap_options = dict(
    text_auto=True,
    color_continuous_scale=px.colors.diverging.PRGn,
    color_continuous_midpoint=0,  # centre the diverging scale on "no correlation"
)
fig = px.imshow(df_corr_vis, **heatmap_options)
fig.update_layout(
    title='Pearson correlation coefficients',
    height=600,
    xaxis=dict(tickangle=90),
    plot_bgcolor='rgba(0, 0, 0, 0)',
)
fig.show()
# Same correlation heatmap, restricted to the London listings.
df_corr = df.loc[df['city'] == 'london', vars_of_interest].corr().round(2)

# True on and above the diagonal: those cells are blanked so each pair of
# variables appears only once.
mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_vis = df_corr.mask(mask)

fig = px.imshow(df_corr_vis, text_auto=True,
                color_continuous_scale=px.colors.diverging.PRGn,
                color_continuous_midpoint=0)
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  # The title names the subset so this figure cannot be
                  # mistaken for the all-cities heatmap.
                  title='Pearson correlation coefficients (London only)',
                  xaxis=dict(tickangle=90),
                  height=600
                  )
fig.show()
These correlations are statistically significant, as shown in the p-value matrix below.
# Matrix of p-values for the Pearson correlation of every pair of variables;
# rows and columns both follow vars_of_interest. Values are rounded to two
# decimals for display.
df_pvals = pd.DataFrame(index=vars_of_interest)
for col_metric in vars_of_interest:
    df_pvals[col_metric] = [
        pearsonr(df[col_metric], df[row_metric])[1].round(2)  # [1] is the p-value
        for row_metric in vars_of_interest
    ]
df_pvals
# Blank the diagonal and upper triangle so each pair appears once. The mask is
# built from df_pvals itself instead of reusing a `mask` variable left over
# from another cell.
upper_triangle = np.triu(np.ones(df_pvals.shape, dtype=bool))
df_pvals_vis = df_pvals.mask(upper_triangle)

fig = px.imshow(df_pvals_vis, text_auto=True,
                color_continuous_scale=px.colors.sequential.Purples_r,
                color_continuous_midpoint=0.1
                )
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  # The p-values were rounded to two decimals when df_pvals was
                  # built, so a displayed 0 means "below 0.005", not
                  # floating-point underflow.
                  title='''p-values of the Pearson correlation coefficients
<br><sup>(zeros mean that the p-value is below 0.005 and therefore rounds to 0.00)</sup>''',
                  xaxis=dict(tickangle=90),
                  height=600
                  )
fig.show()
The factors influencing listing prices can differ across cities. For example:
# Faceted scatter (one panel per city) of distance from the centre against
# nightly price, each with its own OLS trendline.
fig = px.scatter(df,
                 x='dist',
                 y='price_per_night',
                 hover_data=['price_per_night', 'dist'],
                 color='city',
                 facet_col='city',
                 facet_col_wrap=5,
                 title='Distance from centre vs price',
                 trendline='ols',
                 height=600, width=1000
                 )
fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=False)

# Fitted OLS results, one row per city.
trendlines = px.get_trendline_results(fig).set_index('city')

# Blank the default "city=..." facet titles; a richer label is added below.
fig.for_each_annotation(lambda a: a.update(text=''))

# Label every panel with the city name and its fit statistics. Iterating the
# marker traces directly avoids the private fig._grid_ref attribute and the
# hard-coded 5-column / every-second-trace index arithmetic.
for trace in fig.data:
    if trace.mode != 'markers':
        continue  # trendline traces (mode='lines') carry no new panel
    city = trace.name
    fit = trendlines.loc[city, 'px_fit_results']
    slope = fit.params[1]        # params[0] is the intercept
    slope_pval = fit.pvalues[1]
    r2 = fit.rsquared
    fig.add_annotation(
        xref=trace.xaxis, x=np.max(trace.x),  # right edge of this panel's data
        yref=trace.yaxis, y=700,
        text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}',
        showarrow=False,
        align='right'
    )

# Each city gets an independent x range; y is clipped to a common 0-800 window.
fig.update_xaxes(matches=None, tickformat='.0f')
fig.update_yaxes(matches=None, tickformat='.0f', range=[0, 800])
fig.show()
# Faceted scatter (one panel per city) of guest satisfaction against nightly
# price, each with its own OLS trendline.
fig = px.scatter(df,
                 x='guest_satisfaction_overall',
                 y='price_per_night',
                 hover_data=['price_per_night', 'guest_satisfaction_overall'],
                 color='city',
                 facet_col='city',
                 facet_col_wrap=5,
                 title='Guest satisfaction vs price',
                 trendline='ols',
                 height=550, width=1000
                 )
fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=False)

# Fitted OLS results, one row per city.
trendlines = px.get_trendline_results(fig).set_index('city')

# Blank the default "city=..." facet titles; a richer label is added below.
fig.for_each_annotation(lambda a: a.update(text=''))

# Label every panel with the city name and its fit statistics. Iterating the
# marker traces directly avoids the private fig._grid_ref attribute and the
# hard-coded 5-column / every-second-trace index arithmetic.
for trace in fig.data:
    if trace.mode != 'markers':
        continue  # trendline traces (mode='lines') carry no new panel
    city = trace.name
    fit = trendlines.loc[city, 'px_fit_results']
    slope = fit.params[1]        # params[0] is the intercept
    slope_pval = fit.pvalues[1]
    r2 = fit.rsquared
    fig.add_annotation(
        # NOTE(review): x=0.5 is in data units of the satisfaction axis -
        # confirm this placement is intended.
        xref=trace.xaxis, x=0.5,
        yref=trace.yaxis, y=700,
        text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}',
        showarrow=False,
        align='right'
    )

# A single shared x-axis caption, centred under the whole figure.
fig.add_annotation(
    xref='paper', x=.5,
    yref='paper', y=-.1,
    text='guest satisfaction',
    showarrow=False,
    align='right'
)
fig.update_xaxes(matches=None, tickformat='.0f', title='')
fig.update_yaxes(matches=None, tickformat='.0f', range=[0, 800])
fig.show()
# Faceted scatter (one panel per city) of distance from the metro against
# guest satisfaction, each with its own OLS trendline.
fig = px.scatter(
    df,
    x='metro_dist',
    y='guest_satisfaction_overall',
    hover_data=['guest_satisfaction_overall', 'metro_dist'],
    color='city',
    facet_col='city',
    facet_col_wrap=5,
    title='Distance from metro vs guest satisfaction',
    trendline='ols',
    height=600, width=1000
)
fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)',
    showlegend=False)

# Fitted OLS results, one row per city.
trendlines = px.get_trendline_results(fig).set_index('city')

# Blank the default "city=..." facet titles; a richer label is added below.
fig.for_each_annotation(lambda a: a.update(text=''))

# Label every panel with the city name and its fit statistics. Iterating the
# marker traces directly avoids the private fig._grid_ref attribute and the
# hard-coded 5-column / every-second-trace index arithmetic.
for trace in fig.data:
    if trace.mode != 'markers':
        continue  # trendline traces (mode='lines') carry no new panel
    city = trace.name
    fit = trendlines.loc[city, 'px_fit_results']
    slope = fit.params[1]        # params[0] is the intercept
    slope_pval = fit.pvalues[1]
    r2 = fit.rsquared
    fig.add_annotation(
        xref=trace.xaxis, x=np.max(trace.x),  # right edge of this panel's data
        yref=trace.yaxis, y=2,
        text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}',
        showarrow=False,
        align='right'
    )

# Axes are left independent per panel; no fixed y range for this figure.
fig.update_xaxes(matches=None, tickformat='.0f')
fig.update_yaxes(matches=None, tickformat='.0f')
fig.show()
# Raw trendline table of the most recent figure (one fitted model per city).
results = px.get_trendline_results(fig)

# Full statsmodels report for the Amsterdam panel's fit.
amsterdam_fit = trendlines.loc['amsterdam', 'px_fit_results']
amsterdam_fit.summary()
| Dep. Variable: | y | R-squared: | 0.001 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.001 |
| Method: | Least Squares | F-statistic: | 2.370 |
| Date: | Tue, 14 Mar 2023 | Prob (F-statistic): | 0.124 |
| Time: | 09:50:02 | Log-Likelihood: | -6794.8 |
| No. Observations: | 2080 | AIC: | 1.359e+04 |
| Df Residuals: | 2078 | BIC: | 1.360e+04 |
| Df Model: | 1 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | 94.2337 | 0.229 | 410.713 | 0.000 | 93.784 | 94.684 |
| x1 | 0.2577 | 0.167 | 1.539 | 0.124 | -0.071 | 0.586 |
| Omnibus: | 1818.334 | Durbin-Watson: | 1.265 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 93279.298 |
| Skew: | -3.872 | Prob(JB): | 0.00 |
| Kurtosis: | 34.880 | Cond. No. | 3.14 |
# Print the column names, dtypes and non-null counts of the combined dataframe.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 51707 entries, 0 to 1798 Data columns (total 25 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 realSum 51707 non-null float64 1 room_type 51707 non-null object 2 room_shared 51707 non-null bool 3 room_private 51707 non-null bool 4 person_capacity 51707 non-null float64 5 host_is_superhost 51707 non-null bool 6 multi 51707 non-null int64 7 biz 51707 non-null int64 8 cleanliness_rating 51707 non-null float64 9 guest_satisfaction_overall 51707 non-null float64 10 bedrooms 51707 non-null int64 11 dist 51707 non-null float64 12 metro_dist 51707 non-null float64 13 attr_index 51707 non-null float64 14 attr_index_norm 51707 non-null float64 15 rest_index 51707 non-null float64 16 rest_index_norm 51707 non-null float64 17 lng 51707 non-null float64 18 lat 51707 non-null float64 19 city 51707 non-null object 20 is_weekend 51707 non-null int64 21 week_part 51707 non-null object 22 price_per_night 51707 non-null float64 23 host_has_multiple_listings 51707 non-null object 24 host_is_biz 51707 non-null object dtypes: bool(3), float64(13), int64(4), object(5) memory usage: 9.2+ MB
# create dummy variables for cities
# One-hot encode city and room type. dtype=int is requested explicitly because
# newer pandas versions return boolean dummies by default, and a design matrix
# mixing bool and float columns is converted to object dtype, which
# statsmodels refuses to fit.
city_dummies = pd.get_dummies(df['city'], prefix='city', dtype=int)
room_type_dummies = pd.get_dummies(df['room_type'], prefix='room_type', dtype=int)

df_all_features = pd.concat([df, city_dummies, room_type_dummies], axis=1)
# Boolean flags become 0/1 so they can enter the regression.
for col in ['room_shared', 'host_is_superhost']:
    df_all_features[col] = df_all_features[col].astype(int)

base_features = [
    'person_capacity', 'bedrooms', 'host_is_superhost', 'biz',
    'cleanliness_rating', 'guest_satisfaction_overall',
    'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm', 'is_weekend',
]
dummy_features = city_dummies.columns.tolist() + room_type_dummies.columns.tolist()

# One dummy per category group is dropped to avoid perfect collinearity with
# the constant: Lisbon and shared rooms are the reference categories.
x = (
    df_all_features[base_features + dummy_features]
    .drop(columns=['city_lisbon', 'room_type_Shared room'])
    .astype(float)
)
y = np.array(df_all_features['price_per_night']).astype(float)

# Pooled OLS of nightly price on listing attributes, with an intercept.
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
| Dep. Variable: | y | R-squared: | 0.241 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.241 |
| Method: | Least Squares | F-statistic: | 747.9 |
| Date: | Tue, 14 Mar 2023 | Prob (F-statistic): | 0.00 |
| Time: | 09:50:06 | Log-Likelihood: | -3.2991e+05 |
| No. Observations: | 51707 | AIC: | 6.599e+05 |
| Df Residuals: | 51684 | BIC: | 6.601e+05 |
| Df Model: | 22 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -149.2155 | 10.910 | -13.677 | 0.000 | -170.599 | -127.832 |
| person_capacity | 12.1119 | 0.692 | 17.505 | 0.000 | 10.756 | 13.468 |
| bedrooms | 43.9391 | 1.256 | 34.996 | 0.000 | 41.478 | 46.400 |
| host_is_superhost | -0.4530 | 1.539 | -0.294 | 0.769 | -3.470 | 2.564 |
| biz | 13.9951 | 1.428 | 9.800 | 0.000 | 11.196 | 16.794 |
| cleanliness_rating | 3.1463 | 0.952 | 3.305 | 0.001 | 1.280 | 5.012 |
| guest_satisfaction_overall | 0.3462 | 0.103 | 3.346 | 0.001 | 0.143 | 0.549 |
| dist | -0.2372 | 0.489 | -0.485 | 0.628 | -1.196 | 0.722 |
| metro_dist | -4.4357 | 0.947 | -4.685 | 0.000 | -6.291 | -2.580 |
| attr_index_norm | 3.2576 | 0.113 | 28.825 | 0.000 | 3.036 | 3.479 |
| rest_index_norm | 0.0013 | 0.066 | 0.020 | 0.984 | -0.127 | 0.130 |
| is_weekend | 2.7205 | 1.262 | 2.156 | 0.031 | 0.248 | 5.193 |
| city_amsterdam | 165.3610 | 3.876 | 42.662 | 0.000 | 157.764 | 172.958 |
| city_athens | -57.4367 | 3.019 | -19.027 | 0.000 | -63.353 | -51.520 |
| city_barcelona | 41.4598 | 3.688 | 11.242 | 0.000 | 34.231 | 48.688 |
| city_berlin | 12.9153 | 4.155 | 3.109 | 0.002 | 4.772 | 21.058 |
| city_budapest | -55.2634 | 3.050 | -18.120 | 0.000 | -61.241 | -49.286 |
| city_london | 50.1832 | 3.756 | 13.361 | 0.000 | 42.821 | 57.545 |
| city_paris | 57.5275 | 3.156 | 18.229 | 0.000 | 51.342 | 63.713 |
| city_rome | -18.8338 | 2.552 | -7.381 | 0.000 | -23.835 | -13.833 |
| city_vienna | -0.2127 | 3.507 | -0.061 | 0.952 | -7.086 | 6.660 |
| room_type_Entire home/apt | 100.5635 | 7.565 | 13.293 | 0.000 | 85.736 | 115.391 |
| room_type_Private room | 42.8076 | 7.644 | 5.600 | 0.000 | 27.826 | 57.789 |
| Omnibus: | 134661.840 | Durbin-Watson: | 1.908 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 4190421196.059 |
| Skew: | 30.271 | Prob(JB): | 0.00 |
| Kurtosis: | 1396.318 | Cond. No. | 2.17e+03 |
df_all_features = pd.concat([df, city_dummies, room_type_dummies], axis=1)
# Boolean flags become 0/1 so they can enter the regression.
for col in ['room_shared', 'host_is_superhost']:
    df_all_features[col] = 1*df_all_features[col]

# Select the London rows once and reuse them for both x and y.
london_rows = df_all_features[df_all_features['city'] == 'london']

# City dummies are omitted (single city); shared rooms are the reference room
# type. The cast to float keeps the design matrix numeric even if the dummy
# columns are boolean, which statsmodels would otherwise reject as object dtype.
x = (
    london_rows[[
        'person_capacity', 'bedrooms', 'host_is_superhost', 'biz',
        'cleanliness_rating', 'guest_satisfaction_overall',
        'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm', 'is_weekend',
    ] + room_type_dummies.columns.tolist()]
    .drop(columns=['room_type_Shared room'])
    .astype(float)
)
y = np.array(london_rows['price_per_night']).astype(float)

# OLS of nightly price on listing attributes for London only, with an intercept.
x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
| Dep. Variable: | y | R-squared: | 0.231 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.230 |
| Method: | Least Squares | F-statistic: | 230.4 |
| Date: | Tue, 14 Mar 2023 | Prob (F-statistic): | 0.00 |
| Time: | 09:50:07 | Log-Likelihood: | -67454. |
| No. Observations: | 9993 | AIC: | 1.349e+05 |
| Df Residuals: | 9979 | BIC: | 1.350e+05 |
| Df Model: | 13 | ||
| Covariance Type: | nonrobust |
| coef | std err | t | P>|t| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -196.9080 | 37.294 | -5.280 | 0.000 | -270.012 | -123.804 |
| person_capacity | 17.6280 | 2.470 | 7.137 | 0.000 | 12.786 | 22.470 |
| bedrooms | 85.6694 | 4.524 | 18.936 | 0.000 | 76.801 | 94.537 |
| host_is_superhost | 9.4253 | 5.916 | 1.593 | 0.111 | -2.171 | 21.022 |
| biz | -7.8009 | 4.784 | -1.631 | 0.103 | -17.178 | 1.576 |
| cleanliness_rating | 1.1475 | 2.728 | 0.421 | 0.674 | -4.200 | 6.495 |
| guest_satisfaction_overall | 0.5227 | 0.283 | 1.846 | 0.065 | -0.032 | 1.078 |
| dist | 4.6404 | 1.660 | 2.795 | 0.005 | 1.386 | 7.895 |
| metro_dist | -11.1501 | 2.388 | -4.669 | 0.000 | -15.831 | -6.469 |
| attr_index_norm | 4.6970 | 0.461 | 10.184 | 0.000 | 3.793 | 5.601 |
| rest_index_norm | 0.0653 | 0.702 | 0.093 | 0.926 | -1.310 | 1.440 |
| is_weekend | -1.7089 | 4.163 | -0.410 | 0.681 | -9.870 | 6.452 |
| room_type_Entire home/apt | 121.7948 | 29.541 | 4.123 | 0.000 | 63.888 | 179.701 |
| room_type_Private room | 21.0085 | 29.439 | 0.714 | 0.475 | -36.699 | 78.716 |
| Omnibus: | 22537.598 | Durbin-Watson: | 1.968 |
|---|---|---|---|
| Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 155015487.623 |
| Skew: | 21.373 | Prob(JB): | 0.00 |
| Kurtosis: | 611.663 | Cond. No. | 2.42e+03 |